library(assertr)
Warning: package ‘assertr’ was built under R version 4.2.1
candy_2015 <- read_xlsx("raw_data/boing-boing-candy-2015.xlsx") %>%
clean_names()
candy_2016 <- read_xlsx("raw_data/boing-boing-candy-2016.xlsx") %>%
clean_names()
candy_2017 <- read_xlsx("raw_data/boing-boing-candy-2017.xlsx") %>%
clean_names()
New names:
candy_2015
candy_2016
candy_2017
NA
NA
names_2016
[1] "internal_id" "q1_going_out"
[3] "q2_gender" "q3_age"
[5] "q4_country" "q5_state_province_county_etc"
[7] "q6_100_grand_bar" "q6_anonymous_brown_globs_that_come_in_black_and_orange_wrappers_a_k_a_mary_janes"
[9] "q6_any_full_sized_candy_bar" "q6_black_jacks"
[11] "q6_bonkers_the_candy" "q6_bonkers_the_board_game"
[13] "q6_bottle_caps" "q6_boxo_raisins"
[15] "q6_broken_glow_stick" "q6_butterfinger"
[17] "q6_cadbury_creme_eggs" "q6_candy_corn"
[19] "q6_candy_that_is_clearly_just_the_stuff_given_out_for_free_at_restaurants" "q6_caramellos"
[21] "q6_cash_or_other_forms_of_legal_tender" "q6_chardonnay"
[23] "q6_chick_o_sticks_we_don_t_know_what_that_is" "q6_chiclets"
[25] "q6_coffee_crisp" "q6_creepy_religious_comics_chick_tracts"
[27] "q6_dental_paraphenalia" "q6_dots"
[29] "q6_dove_bars" "q6_fuzzy_peaches"
[31] "q6_generic_brand_acetaminophen" "q6_glow_sticks"
[33] "q6_goo_goo_clusters" "q6_good_n_plenty"
[35] "q6_gum_from_baseball_cards" "q6_gummy_bears_straight_up"
[37] "q6_hard_candy" "q6_healthy_fruit"
[39] "q6_heath_bar" "q6_hersheys_dark_chocolate"
[41] "q6_hershey_s_milk_chocolate" "q6_hersheys_kisses"
[43] "q6_hugs_actual_physical_hugs" "q6_jolly_rancher_bad_flavor"
[45] "q6_jolly_ranchers_good_flavor" "q6_joy_joy_mit_iodine"
[47] "q6_junior_mints" "q6_senior_mints"
[49] "q6_kale_smoothie" "q6_kinder_happy_hippo"
[51] "q6_kit_kat" "q6_laffy_taffy"
[53] "q6_lemon_heads" "q6_licorice_not_black"
[55] "q6_licorice_yes_black" "q6_lindt_truffle"
[57] "q6_lollipops" "q6_mars"
[59] "q6_maynards" "q6_mike_and_ike"
[61] "q6_milk_duds" "q6_milky_way"
[63] "q6_regular_m_ms" "q6_peanut_m_m_s"
[65] "q6_blue_m_ms" "q6_red_m_ms"
[67] "q6_green_party_m_ms" "q6_independent_m_ms"
[69] "q6_abstained_from_m_ming" "q6_minibags_of_chips"
[71] "q6_mint_kisses" "q6_mint_juleps"
[73] "q6_mr_goodbar" "q6_necco_wafers"
[75] "q6_nerds" "q6_nestle_crunch"
[77] "q6_nown_laters" "q6_peeps"
[79] "q6_pencils" "q6_pixy_stix"
[81] "q6_real_housewives_of_orange_county_season_9_blue_ray" "q6_reese_s_peanut_butter_cups"
[83] "q6_reeses_pieces" "q6_reggie_jackson_bar"
[85] "q6_rolos" "q6_sandwich_sized_bags_filled_with_boo_berry_crunch"
[87] "q6_skittles" "q6_smarties_american"
[89] "q6_smarties_commonwealth" "q6_snickers"
[91] "q6_sourpatch_kids_i_e_abominations_of_nature" "q6_spotted_dick"
[93] "q6_starburst" "q6_sweet_tarts"
[95] "q6_swedish_fish" "q6_sweetums_a_friend_to_diabetes"
[97] "q6_take_5" "q6_tic_tacs"
[99] "q6_those_odd_marshmallow_circus_peanut_things" "q6_three_musketeers"
[101] "q6_tolberone_something_or_other" "q6_trail_mix"
[103] "q6_twix" "q6_vials_of_pure_high_fructose_corn_syrup_for_main_lining_into_your_vein"
[105] "q6_vicodin" "q6_whatchamacallit_bars"
[107] "q6_white_bread" "q6_whole_wheat_anything"
[109] "q6_york_peppermint_patties" "q7_joy_other"
[111] "q8_despair_other" "q9_other_comments"
[113] "q10_dress" "x114"
[115] "q11_day" "q12_media_daily_dish"
[117] "q12_media_science" "q12_media_espn"
[119] "q12_media_yahoo" "click_coordinates_x_y"
candy_cleaned_2015 <- candy_2015 %>%
rename(age = how_old_are_you,
going_out = are_you_going_actually_going_trick_or_treating_yourself) %>%
mutate(year = str_extract(timestamp, '[0-9]{1,4}'), .after = timestamp) %>%
mutate(id = row_number(timestamp) + 1e6) %>%
mutate(country = NA_character_, .after = age) %>%
#mutate(is_going_out = ifelse(is_going_out == "Yes",T, F)) %>%
select(id, year:york_peppermint_patties, necco_wafers) %>%
# replace all non integer age inputs as NA, convert values to integers
mutate(age = as.integer(age), year = as.integer(year)) %>%
pivot_longer(butterfinger:necco_wafers, names_to = "candy_name", values_to = "rating") %>%
select(id, year, going_out, age, country, candy_name, rating)
Warning: NAs introduced by coercionWarning: NAs introduced by coercion to integer range
# maybe pivot_longer for combining three datasets
candy_cleaned_2015
candy_cleaned_2016 <- candy_2016 %>%
rename(going_out = are_you_going_actually_going_trick_or_treating_yourself,
age = how_old_are_you,
country = which_country_do_you_live_in,
gender = your_gender) %>%
mutate(id = row_number(timestamp) + 2e6, .before = timestamp) %>%
mutate(year = str_extract(timestamp, '[0-9]{1,4}'), .after = timestamp) %>%
clean_country_names() %>%
select(id, year:york_peppermint_patties, -gender, -which_state_province_county_do_you_live_in) %>%
# replace all non integer age inputs as NA, convert values to integers
mutate(age = as.integer(age), year = as.integer(year)) %>%
#pivot_longer(x100_grand_bar:york_peppermint_patties, names_to = "candy_name", values_to = "rating")
pivot_longer(x100_grand_bar:york_peppermint_patties, names_to = "candy_name", values_to = "rating") %>%
select(id, year, going_out, age, country, candy_name, rating)
Warning: NAs introduced by coercionWarning: NAs introduced by coercion to integer range
candy_cleaned_2016
NA
candy_cleaned_2017 <- candy_2017 %>%
rename(id = internal_id) %>%
# rename(id = internal_id) %>%
pivot_longer(q1_going_out:q11_day, names_to = "col_names", values_to = "value") %>%
select(id, col_names, value) %>%
mutate(col_names = str_remove(col_names, "q[0-9]_")) %>%
pivot_wider(names_from = col_names, values_from = value) %>%
clean_names() %>%
mutate(year = as.integer(2017), .after = id) %>%
mutate(age = as.integer(age)) %>%
clean_country_names() %>%
pivot_longer(x100_grand_bar:york_peppermint_patties, names_to = "candy_name", values_to = "rating") %>%
select(id, year, going_out, age, country, candy_name, rating)
Warning: NAs introduced by coercion
candy_cleaned_2017
Summary of countries No cleaning = 118 unique countries Remove non character values = Lowercase = 96 unique countries
clean_country_names <- function(dataframe){
us_list <- c("merica", "ahemamerca", "alaska", "america","california","murica",
"murrika","newjersey","newyork","northcarolina","pittsburgh",
"trumpistan", "unhingedstates", "uniedstates","unitestates",
"unitesstates", "uniteds", "us", "theunitedstates",
"theunitedstatesofamerica", "unitedsates", "unitedstaes",
"ipretendtobefromcanada,butiamreallyfromtheunitedstates",
"unitedstate", "unitedstatea", "unitedstated", "usofa", "ussa",
"ud", "USA", "sub-canadiannorthamericamerica",
"theyooessofaaayyyyyy", "unitsstates")
uk_list <- c("endland", "england", "scotland", "uk", "unitedkingdom",
"unitedkindom")
canada_list <- c("can", "canada")
exclude_list <- c("a", "canae", "cascadia", "earth", "fearandloathing",
"idontknowanymore", "insanitylately", "namerica", "narnia",
"sovietcanuckistan", "subscribetodmuzonyoutube", "denial",
"godscountry", "neverland", "oneofthebestones", "seeabove",
"somewhere", "eua", "thereisntoneforoldmen",
"therepublicofcascadia", "thisone")
dataframe %>%
mutate(country = str_remove_all(country, "[0-9]*"),
country = str_to_lower(country),
country = str_remove_all(country, "[`.' !?0-9]"),
country = if_else(str_detect(country, "uniteds"),"USA", country),
country = if_else(str_detect(country, "usa"),"USA", country)) %>%
mutate(country = case_when(
country %in% us_list ~ "USA",
country %in% uk_list ~ "UK",
country %in% canada_list ~ "Canada",
country %in% exclude_list ~ NA_character_,
!is.na(country) & country != "" ~ "Other"
), .after = country)
}
candy_cleaned_2015 %>%
bind_rows(candy_cleaned_2016) %>%
bind_rows(candy_cleaned_2017) %>%
distinct(candy_name) %>%
arrange(candy_name)
NA
candy_cleaned_2015
candy_cleaned_2016
candy_cleaned_2017
Analysis questions What is the total number of candy ratings given across the three years. (Number of candy ratings, not the number of raters. Don’t count missing values) #just a list of all the candy ratings What was the average age of people who are going out trick or treating? # age column What was the average age of people who are not going trick or treating? # age column For each of joy, despair and meh, which candy bar received the most of these ratings? # candy ratings How many people rated Starburst as despair? # starburst column (candy columns)
For the next three questions, count despair as -1, joy as +1, and meh as 0.
What was the most popular candy bar by this rating system for each gender in the dataset ? # candy ratings What was the most popular candy bar in each year? # date or year What was the most popular candy bar by this rating for people in US, Canada, UK, and all other countries? # countries